import os
import re
import urllib.error
import urllib.request


# Crawl every article link on the CSDN blog homepage and save each page to disk.
url = "http://blog.csdn.net/"
# Spoof a desktop-browser User-Agent so the server does not reject the request.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4295.400 QQBrowser/9.7.12661.400")
try:
    opener = urllib.request.build_opener()
    # Fixed typo (was `addhearders`): the User-Agent was silently never sent.
    opener.addheaders = [headers]
    # Install the opener globally so urlretrieve() below also sends our headers.
    urllib.request.install_opener(opener)
    pat = '<a strategy=".*?" href="(.*?)"'
    pat_blogname = '<a strategy=".*?" href=".*?" target="_blank">(.*?)</a>'
    # "ignore" skips undecodable bytes instead of raising UnicodeDecodeError.
    data = opener.open(url).read().decode("utf-8", "ignore")
    alllink = re.compile(pat).findall(data)
    # re.S lets "." also match newlines, since a title may span multiple lines.
    blogname = re.compile(pat_blogname, re.S).findall(data)
    # Strip spaces, newlines and "|" so the titles are safe to use as file names.
    blogname = [name.replace(" ", "").replace("\n", "").replace("|", "") for name in blogname]
    print(blogname)
    print("一共有" + str(len(alllink)) + "篇文章！")
    # Make sure the output directory exists; urlretrieve does not create it.
    os.makedirs("D:/spider", exist_ok=True)
    # zip() guards against an IndexError when the two regexes match
    # different numbers of results (link without a title, or vice versa).
    for i, (link, name) in enumerate(zip(alllink, blogname)):
        try:
            # Download straight to disk — the previous extra urlopen().read()
            # fetched every page twice and threw the first copy away.
            urllib.request.urlretrieve(link, "D:/spider/" + str(i + 1) + "、" + name + ".html")
        except urllib.error.URLError as err:
            # One broken link should not abort the whole crawl.
            print("下载失败：" + link)
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print("错误代码是：" + str(e.code))
    if hasattr(e, "reason"):
        print(e.reason)